找传奇、传世资源到传世资源站!

WebMagic 0.4.0 发布,Java爬虫框架

8.5玩家评分(1人评分)
下载后可评
介绍 评论 失效链接反馈

【例子介绍】WebMagic 0.4.0 发布,Java爬虫框架
修复 0.3.2 及之前版本连接池不生效的问题 #30。使用 HttpClient 4.3.1 新的连接池机制,实现连接复用功能。经测试,下载速度可达到 90% 左右的提升。测试代码:Kr36NewsModel.java。二、增加同步抓取的 API,对于小规模的抓取...

【相关图片】

from clipboard

【源码结构】
文件清单
└── webmagic-master
    ├── en_docs
    │   └── README.md
    ├── pom.xml
    ├── README.md
    ├── release-note.md
    ├── user-manual.md
    ├── webmagic-core
    │   ├── pom.xml
    │   ├── README.md
    │   └── src
    │       ├── main
    │       │   ├── java
    │       │   │   └── us
    │       │   │       └── codecraft
    │       │   │           └── webmagic
    │       │   │               ├── downloader
    │       │   │               │   ├── Downloader.java
    │       │   │               │   ├── HttpClientDownloader.java
    │       │   │               │   ├── HttpClientGenerator.java
    │       │   │               │   └── package.html
    │       │   │               ├── package.html
    │       │   │               ├── Page.java
    │       │   │               ├── pipeline
    │       │   │               │   ├── CollectorPipeline.java
    │       │   │               │   ├── ConsolePipeline.java
    │       │   │               │   ├── FilePipeline.java
    │       │   │               │   ├── package.html
    │       │   │               │   ├── Pipeline.java
    │       │   │               │   └── ResultItemsCollectorPipeline.java
    │       │   │               ├── processor
    │       │   │               │   ├── example
    │       │   │               │   │   ├── BaiduBaikePageProcesser.java
    │       │   │               │   │   ├── GithubRepoPageProcesser.java
    │       │   │               │   │   └── OschinaBlogPageProcesser.java
    │       │   │               │   ├── package.html
    │       │   │               │   ├── PageProcessor.java
    │       │   │               │   └── SimplePageProcessor.java
    │       │   │               ├── Request.java
    │       │   │               ├── ResultItems.java
    │       │   │               ├── scheduler
    │       │   │               │   ├── package.html
    │       │   │               │   ├── PriorityScheduler.java
    │       │   │               │   ├── QueueScheduler.java
    │       │   │               │   └── Scheduler.java
    │       │   │               ├── selector
    │       │   │               │   ├── AndSelector.java
    │       │   │               │   ├── BaseElementSelector.java
    │       │   │               │   ├── CssSelector.java
    │       │   │               │   ├── ElementSelector.java
    │       │   │               │   ├── Html.java
    │       │   │               │   ├── OrSelector.java
    │       │   │               │   ├── package.html
    │       │   │               │   ├── PlainText.java
    │       │   │               │   ├── RegexResult.java
    │       │   │               │   ├── RegexSelector.java
    │       │   │               │   ├── ReplaceSelector.java
    │       │   │               │   ├── Selectable.java
    │       │   │               │   ├── Selector.java
    │       │   │               │   ├── Selectors.java
    │       │   │               │   ├── SmartContentSelector.java
    │       │   │               │   ├── XpathSelector.java
    │       │   │               │   └── XsoupSelector.java
    │       │   │               ├── Site.java
    │       │   │               ├── Spider.java
    │       │   │               ├── Task.java
    │       │   │               └── utils
    │       │   │                   ├── EnvironmentUtil.java
    │       │   │                   ├── Experimental.java
    │       │   │                   ├── FilePersistentBase.java
    │       │   │                   ├── NumberUtils.java
    │       │   │                   ├── package.html
    │       │   │                   ├── ThreadUtils.java
    │       │   │                   └── UrlUtils.java
    │       │   └── resources
    │       │       └── log4j.xml
    │       └── test
    │           ├── java
    │           │   └── us
    │           │       └── codecraft
    │           │           └── webmagic
    │           │               ├── downloader
    │           │               │   └── HttpClientDownloaderTest.java
    │           │               ├── HtmlTest.java
    │           │               ├── scheduler
    │           │               │   └── PrioritySchedulerTest.java
    │           │               ├── selector
    │           │               │   ├── ExtractorsTest.java
    │           │               │   └── RegexSelectorTest.java
    │           │               ├── SpiderTest.java
    │           │               └── utils
    │           │                   ├── EnvironmentUtilTest.java
    │           │                   └── UrlUtilsTest.java
    │           └── resources
    │               └── log4j.xml
    ├── webmagic-extension
    │   ├── pom.xml
    │   ├── README.md
    │   └── src
    │       ├── main
    │       │   └── java
    │       │       └── us
    │       │           └── codecraft
    │       │               └── webmagic
    │       │                   ├── downloader
    │       │                   │   └── FileCache.java
    │       │                   ├── example
    │       │                   │   ├── BaiduBaike.java
    │       │                   │   ├── GithubRepo.java
    │       │                   │   └── OschinaBlog.java
    │       │                   ├── model
    │       │                   │   ├── AfterExtractor.java
    │       │                   │   ├── annotation
    │       │                   │   │   ├── ComboExtract.java
    │       │                   │   │   ├── ExtractBy.java
    │       │                   │   │   ├── ExtractByUrl.java
    │       │                   │   │   ├── Formatter.java
    │       │                   │   │   ├── HelpUrl.java
    │       │                   │   │   ├── package.html
    │       │                   │   │   └── TargetUrl.java
    │       │                   │   ├── ConsolePageModelPipeline.java
    │       │                   │   ├── Extractor.java
    │       │                   │   ├── FieldExtractor.java
    │       │                   │   ├── formatter
    │       │                   │   │   ├── BasicTypeFormatter.java
    │       │                   │   │   ├── DateFormatter.java
    │       │                   │   │   ├── ObjectFormatter.java
    │       │                   │   │   └── ObjectFormatters.java
    │       │                   │   ├── HasKey.java
    │       │                   │   ├── ModelPageProcessor.java
    │       │                   │   ├── ModelPipeline.java
    │       │                   │   ├── OOSpider.java
    │       │                   │   ├── package.html
    │       │                   │   ├── PageModelCollectorPipeline.java
    │       │                   │   └── PageModelExtractor.java
    │       │                   ├── MultiPageModel.java
    │       │                   ├── pipeline
    │       │                   │   ├── CollectorPageModelPipeline.java
    │       │                   │   ├── FilePageModelPipeline.java
    │       │                   │   ├── JsonFilePageModelPipeline.java
    │       │                   │   ├── JsonFilePipeline.java
    │       │                   │   ├── MultiPagePipeline.java
    │       │                   │   └── PageModelPipeline.java
    │       │                   ├── scheduler
    │       │                   │   ├── FileCacheQueueScheduler.java
    │       │                   │   └── RedisScheduler.java
    │       │                   ├── selector
    │       │                   │   └── JsonPathSelector.java
    │       │                   └── utils
    │       │                       ├── DoubleKeyMap.java
    │       │                       ├── ExtractorUtils.java
    │       │                       └── MultiKeyMapBase.java
    │       └── test
    │           ├── java
    │           │   └── us
    │           │       └── codecraft
    │           │           └── webmagic
    │           │               ├── downloader
    │           │               │   └── FileCacheTest.java
    │           │               ├── formatter
    │           │               │   └── DateFormatterTest.java
    │           │               ├── MockDownloader.java
    │           │               ├── MockPageModelPipeline.java
    │           │               ├── MockPipeline.java
    │           │               ├── model
    │           │               │   └── GithubRepoTest.java
    │           │               ├── processor
    │           │               │   └── GithubRepoProcessor.java
    │           │               ├── scheduler
    │           │               │   └── RedisSchedulerTest.java
    │           │               └── selector
    │           │                   └── JsonPathSelectorTest.java
    │           └── resouces
    │               └── log4j.xml
    ├── webmagic-lucene
    │   ├── pom.xml
    │   ├── README.md
    │   └── src
    │       └── main
    │           ├── java
    │           │   └── us
    │           │       └── codecraft
    │           │           └── webmagic
    │           │               └── pipeline
    │           │                   └── LucenePipeline.java
    │           └── test
    │               └── java
    │                   └── us
    │                       └── codecraft
    │                           └── webmagic
    │                               └── lucene
    │                                   └── OschinaBlog.java
    ├── webmagic-samples
    │   ├── pom.xml
    │   ├── README.md
    │   └── src
    │       ├── main
    │       │   ├── java
    │       │   │   └── us
    │       │   │       └── codecraft
    │       │   │           └── webmagic
    │       │   │               ├── main
    │       │   │               │   └── QuickStarter.java
    │       │   │               ├── model
    │       │   │               │   └── samples
    │       │   │               │       ├── Blog.java
    │       │   │               │       ├── GithubRepo.java
    │       │   │               │       ├── IteyeBlog.java
    │       │   │               │       ├── Kr36NewsModel.java
    │       │   │               │       ├── News163.java
    │       │   │               │       ├── OschinaAnswer.java
    │       │   │               │       └── OschinaBlog.java
    │       │   │               └── samples
    │       │   │                   ├── DiandianBlogProcessor.java
    │       │   │                   ├── HuxiuProcessor.java
    │       │   │                   ├── InfoQMiniBookProcessor.java
    │       │   │                   ├── IteyeBlogProcessor.java
    │       │   │                   ├── NjuBBSProcessor.java
    │       │   │                   ├── OschinaBlogPageProcesser.java
    │       │   │                   ├── OschinaPageProcesser.java
    │       │   │                   ├── QzoneBlogProcessor.java
    │       │   │                   ├── scheduler
    │       │   │                   │   ├── DelayQueueScheduler.java
    │       │   │                   │   ├── LevelLimitScheduler.java
    │       │   │                   │   └── ZipCodePageProcessor.java
    │       │   │                   ├── SinaBlogProcesser.java
    │       │   │                   └── TianyaPageProcesser.java
    │       │   └── resources
    │       │       └── log4j.xml
    │       └── test
    │           └── java
    │               └── us
    │                   └── codecraft
    │                       └── webmagic
    │                           ├── model
    │                           │   └── ProcessorBenchmark.java
    │                           ├── processor
    │                           │   └── SinablogProcessorTest.java
    │                           ├── samples
    │                           │   └── scheduler
    │                           │       └── DelayQueueSchedulerTest.java
    │                           └── SpiderTest.java
    ├── webmagic-saxon
    │   ├── pom.xml
    │   ├── README.md
    │   └── src
    │       ├── main
    │       │   └── java
    │       │       └── us
    │       │           └── codecraft
    │       │               └── webmagic
    │       │                   └── selector
    │       │                       └── Xpath2Selector.java
    │       └── test
    │           └── java
    │               └── us
    │                   └── codecraft
    │                       └── webmagic
    │                           └── selector
    │                               └── XpathSelectorTest.java
    ├── webmagic-selenium
    │   ├── pom.xml
    │   ├── README.md
    │   └── src
    │       ├── main
    │       │   └── java
    │       │       └── us
    │       │           └── codecraft
    │       │               └── webmagic
    │       │                   └── downloader
    │       │                       └── selenium
    │       │                           ├── SeleniumDownloader.java
    │       │                           └── WebDriverPool.java
    │       └── test
    │           └── java
    │               └── us
    │                   └── codecraft
    │                       └── webmagic
    │                           ├── downloader
    │                           │   ├── selenium
    │                           │   │   ├── SeleniumDownloaderTest.java
    │                           │   │   └── WebDriverPoolTest.java
    │                           │   └── SeleniumTest.java
    │                           └── samples
    │                               └── HuabanProcessor.java
    └── zh_docs
        ├── README.md
        └── us
            └── codecraft
                └── webmagic
                    ├── downloader
                    │   ├── Destroyable-cmnt.xml
                    │   ├── Downloader-cmnt.xml
                    │   ├── FileDownloader-cmnt.xml
                    │   ├── HttpClientDownloader-cmnt.xml
                    │   ├── HttpClientPool-cmnt.xml
                    │   └── package.cmnt
                    ├── model
                    │   ├── AfterExtractor-cmnt.xml
                    │   ├── annotation
                    │   │   ├── ComboExtract-cmnt.xml
                    │   │   ├── ExtractBy2-cmnt.xml
                    │   │   ├── ExtractBy2.Type-cmnt.xml
                    │   │   ├── ExtractBy3-cmnt.xml
                    │   │   ├── ExtractBy3.Type-cmnt.xml
                    │   │   ├── ExtractBy-cmnt.xml
                    │   │   ├── ExtractByRaw-cmnt.xml
                    │   │   ├── ExtractByRaw.Type-cmnt.xml
                    │   │   ├── ExtractBy.Type-cmnt.xml
                    │   │   ├── ExtractByUrl-cmnt.xml
                    │   │   ├── HelpUrl-cmnt.xml
                    │   │   ├── package.cmnt
                    │   │   └── TargetUrl-cmnt.xml
                    │   ├── ConsolePageModelPipeline-cmnt.xml
                    │   ├── HasKey-cmnt.xml
                    │   ├── OOSpider-cmnt.xml
                    │   ├── package.cmnt
                    │   └── PageModelPipeline-cmnt.xml
                    ├── package.cmnt
                    ├── Page-cmnt.xml
                    ├── PagedModel-cmnt.xml
                    ├── pipeline
                    │   ├── ConsolePipeline-cmnt.xml
                    │   ├── FilePipeline-cmnt.xml
                    │   ├── JsonFilePageModelPipeline-cmnt.xml
                    │   ├── JsonFilePipeline-cmnt.xml
                    │   ├── package.cmnt
                    │   ├── PagedPipeline-cmnt.xml
                    │   └── Pipeline-cmnt.xml
                    ├── processor
                    │   ├── package.cmnt
                    │   ├── PageProcessor-cmnt.xml
                    │   └── SimplePageProcessor-cmnt.xml
                    ├── Request-cmnt.xml
                    ├── ResultItems-cmnt.xml
                    ├── scheduler
                    │   ├── FileCacheQueueScheduler-cmnt.xml
                    │   ├── package.cmnt
                    │   ├── QueueScheduler-cmnt.xml
                    │   ├── RedisScheduler-cmnt.xml
                    │   └── Scheduler-cmnt.xml
                    ├── selector
                    │   ├── AndSelector-cmnt.xml
                    │   ├── CssSelector-cmnt.xml
                    │   ├── Html-cmnt.xml
                    │   ├── JsonPathSelector-cmnt.xml
                    │   ├── OrSelector-cmnt.xml
                    │   ├── package.cmnt
                    │   ├── PlainText-cmnt.xml
                    │   ├── RegexSelector-cmnt.xml
                    │   ├── ReplaceSelector-cmnt.xml
                    │   ├── Selectable-cmnt.xml
                    │   ├── Selector-cmnt.xml
                    │   ├── SelectorFactory-cmnt.xml
                    │   ├── SmartContentSelector-cmnt.xml
                    │   └── XpathSelector-cmnt.xml
                    ├── Site-cmnt.xml
                    ├── Spider-cmnt.xml
                    ├── Task-cmnt.xml
                    └── utils
                        ├── DoubleKeyMap-cmnt.xml
                        ├── FilePersistentBase-cmnt.xml
                        ├── MultiKeyMapBase-cmnt.xml
                        ├── package.cmnt
                        ├── ThreadUtils-cmnt.xml
                        └── UrlUtils-cmnt.xml

134 directories, 232 files

评论

发表评论必须先登录,您可以登录或者注册新账号!


在线咨询: 问题反馈
客服QQ:174666394

有问题请留言,看到后及时答复